#!/usr/bin/python

import os
import re


# Directory whose *.html files are read by run().
path = "/home/jenny/htmlDownloads/temp/"
# Text file of whitespace-separated stop words that run() excludes from the counts.
stopwordfile = "/home/jenny/htmlDownloads/temp/stopWordList.txt"

# Patterns are compiled once at import time.  The text is lower-cased before
# matching, so the IGNORECASE flag is only a safety net.
# The leading greedy ``.*`` makes the body pattern pick the last
# <div class="body"> that is followed by a </div>; the captured group is the
# text between that opening tag and the first </div> after it.
_BODY_RE = re.compile(r'.*<div class="body">(.*?)</div>.*', re.DOTALL | re.IGNORECASE)
# Opening/closing tags whose name starts with "a" or "p" (e.g. <a ...>, </p>).
_TAG_RE = re.compile(r'<\/?[ap][^>]*>', re.DOTALL | re.IGNORECASE)
_QUOTE_RE = re.compile(r'&quot;', re.DOTALL | re.IGNORECASE)
_NON_WORD_RE = re.compile(r'[^\w]', re.DOTALL | re.IGNORECASE)


def _read_lower(filename):
    """Return the lower-cased contents of *filename*, closing the file afterwards."""
    with open(filename, 'r') as handle:
        return handle.read().lower()


def _extract_words(html, stopwords):
    """Return the words of the page body in *html*, minus *stopwords*.

    *html* is expected to be lower-cased already.  A page without a
    ``<div class="body">...</div>`` section yields an empty list.
    """
    match = _BODY_RE.search(html)
    if match is None:
        return []
    text = match.group(1)
    text = _QUOTE_RE.sub(' ', text)
    text = _TAG_RE.sub(' ', text)
    text = _NON_WORD_RE.sub(' ', text)
    return [word for word in text.split() if word not in stopwords]


def run(directory=None, stopword_path=None):
    """Count word frequencies in the body of every ``.html`` file of a directory.

    Args:
        directory: Directory to scan.  Defaults to the module-level ``path``.
        stopword_path: File holding whitespace-separated stop words, which are
            compared case-insensitively and left out of the counts.  Defaults
            to the module-level ``stopwordfile``.

    Returns:
        A ``(filewords, totalfreq)`` tuple.  ``filewords`` maps each HTML file
        name to a ``{word: count}`` dict for that file; ``totalfreq`` maps each
        word to its count summed over all files.  Both dicts are also printed.

    Raises:
        IOError/OSError: If the directory or one of the files cannot be read.
    """
    if directory is None:
        directory = path
    if stopword_path is None:
        stopword_path = stopwordfile

    # A set gives constant-time membership tests while filtering.
    stopwords = set(_read_lower(stopword_path).split())

    filewords = {}
    totalfreq = {}
    # Sorted so the processing (and printing) order does not depend on the
    # arbitrary order returned by os.listdir().
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.html'):
            continue
        html = _read_lower(os.path.join(directory, name))

        freq = {}
        for word in _extract_words(html, stopwords):
            freq[word] = freq.get(word, 0) + 1
            # The overall total is tracked independently of the per-file
            # count; otherwise a word's total would restart at 1 whenever it
            # is first seen in a new file.
            totalfreq[word] = totalfreq.get(word, 0) + 1
        filewords[name] = freq

    # print(x) with a single argument prints the same thing on Python 2 and 3.
    print(filewords)
    print(totalfreq)
    return filewords, totalfreq
# Script entry point: do the word count only when executed directly, not on import.
if __name__ == "__main__":
    run()